Attribution statement: 1. I did this homework by myself, with help from the book and the professor.

allSchoolsReportStatus.RData –

usVaccines.Rdata –

#reportSampleX.RData –

Introductory/Descriptive Reports

Public vs. Private School Comparisons:

Predictive Analyses:

Please find more details from the R code below,

# import libraries 
#create a function to ensure the libraries are imported
EnsurePackage <- function(x){
  x <- as.character(x)
    if (!require(x,character.only = TRUE)){
      install.packages(pkgs=x, repos = "http://cran.us.r-project.org")
      require(x, character.only = TRUE)
    }
  }
EnsurePackage("nlme")
## Loading required package: nlme
load("~/Documents/R/IST772/allSchoolsReportStatus.RData")
# data(allSchoolsReportStatus)
allSchools <- data.frame(allSchoolsReportStatus)

str(allSchools)
## 'data.frame':    7381 obs. of  3 variables:
##  $ name    : chr  "AGUA DULCE ELEMENTARY" "MEADOWLARK ELEMENTARY" "CALIFORNIA SCHOOL FOR THE DEAF-FREMONT" "HIDDEN VALLEY ELEMENTARY" ...
##  $ pubpriv : chr  "PUBLIC" "PUBLIC" "PUBLIC" "PUBLIC" ...
##  $ reported: chr  "Y" "Y" "Y" "Y" ...
summary(allSchools)
##      name             pubpriv            reported        
##  Length:7381        Length:7381        Length:7381       
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character
# View(allSchools)
dim(allSchools) #  7381    3
## [1] 7381    3
colnames(allSchools) # "name"     "pubpriv"  "reported"
## [1] "name"     "pubpriv"  "reported"
# table(allSchools$reported)
# table(allSchools$pubpriv)
# table(allSchools$isreported)
# table(allSchools$ispubpriv)

# define variables
# name <- allSchools$name
# pubpriv <- allSchools$pubpriv
# reported <- allSchools$reported
# allSchools$isreported <- as.numeric(as.factor(allSchools$reported))-1 # Fix the factor
# allSchools$ispubpriv <- as.numeric(as.factor(allSchools$pubpriv))-1 # Adjust the outcome
# head(allSchools)

allSchoolsDF <- data.frame(allSchools$name
                           ,as.numeric(as.factor(allSchools$pubpriv))-1
                           ,as.numeric(as.factor(allSchools$reported))-1,val=1,stringsAsFactors=FALSE)
colnames(allSchoolsDF) <- c("schoolName","isPublicPrivate","isReported","value")

str(allSchoolsDF)
## 'data.frame':    7381 obs. of  4 variables:
##  $ schoolName     : chr  "AGUA DULCE ELEMENTARY" "MEADOWLARK ELEMENTARY" "CALIFORNIA SCHOOL FOR THE DEAF-FREMONT" "HIDDEN VALLEY ELEMENTARY" ...
##  $ isPublicPrivate: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ isReported     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ value          : num  1 1 1 1 1 1 1 1 1 1 ...
# head(allSchoolsDF)

# allSchoolsDF[which(allSchoolsDF$schoolName=="AGUA DULCE ELEMENTARY"),]

#create a dataset for Public Schools
publicSchools <- subset(allSchoolsDF,allSchoolsDF$isPublicPrivate==1)
nrow(publicSchools)     #5732  
## [1] 5732
#create a dataset for Public Schools
privateSchools <- subset(allSchoolsDF,allSchoolsDF$isPublicPrivate==0)
nrow(privateSchools)     #1649  
## [1] 1649
# PRIVATE = 0
# PUBLIC = 1
table(allSchools$pubpriv)   
## 
## PRIVATE  PUBLIC 
##    1649    5732
table(allSchoolsDF$isPublicPrivate)
## 
##    0    1 
## 1649 5732
table(allSchools$reported)   
## 
##    N    Y 
##  400 6981
table(allSchoolsDF$isReported)
## 
##    0    1 
##  400 6981

Outlier Analysis

# install.packages("dlookr")  # note: requires version 3.5.2 or higher
EnsurePackage("dlookr")
## Loading required package: dlookr
## Loading required package: mice
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## 
## Attaching package: 'dlookr'
## The following object is masked from 'package:base':
## 
##     transform
diagnose_outlier(allSchoolsDF)
plot_outlier(allSchoolsDF)

Exploratory Analysis - Violin plot to unerstand the difference between Public vs Private schools reporting the cases

EnsurePackage("tidyverse")
## Loading required package: tidyverse
## ── Attaching packages ─────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.1     ✓ dplyr   1.0.2
## ✓ tidyr   1.0.2     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0
## ── Conflicts ────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::collapse() masks nlme::collapse()
## x dplyr::filter()   masks mice::filter(), stats::filter()
## x dplyr::lag()      masks stats::lag()
#data distribution
# violin plot with median points

allSchools %>%
  pivot_longer(cols=-c(name), names_to="variable", values_to="value", values_drop_na = TRUE) %>% 
  ggplot(aes(x=variable, y=value)) + geom_violin(trim=FALSE, fill='steelblue', color="orange") + 
  facet_wrap( ~ variable, scales="free") + 
  stat_summary(fun=mean, geom="point", shape=23, size=4,color="red")  + 
  theme_minimal()

Exploratory Analysis - Overall Public vs Private schools reporting the cases

# Summary table
agg <- count(allSchoolsDF, isPublicPrivate, isReported)
totSchools <- sum(agg$n)
ratio <- round(agg$n/totSchools,2)
aggDF <- data.frame(agg,ratio)
aggDF

Exploratory Analysis - Consolidated Public vs Private schools reporting the cases

EnsurePackage("gridExtra")
## Loading required package: gridExtra
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
ecols <- c("steelblue","orange")
agg_ord <- mutate(agg, 
                  isReported = reorder(isReported, -n, sum),
                  isPublicPrivate = reorder(isPublicPrivate, -n, sum))

p1 <- ggplot(agg_ord) +
      geom_col(aes(x = isPublicPrivate, y = n, fill = isReported)) +
      scale_fill_manual(values = ecols)

p2 <- ggplot(agg_ord) +
      geom_col(aes(x = isPublicPrivate, y = n, fill = isReported), position = "dodge") +
      scale_fill_manual(values = ecols)

grid.arrange(p1, p2, nrow = 1)

barplot(table(allSchoolsDF$isReported),
main="All Schools reported",
xlab="Reported",
ylab="Schools",
border="orange",
col="steelblue",
density=10
)

barplot(table(allSchools$pubpriv),
main="Public/Private Schools",
xlab="School",
ylab="Number of Schools",
border="orange",
col="steelblue",
density=10
)

1. Percentage of public schools reporting vaccination data

# total schools split
table(allSchools$pubpriv) 
## 
## PRIVATE  PUBLIC 
##    1649    5732
# 5732/7381 ( ~78%)
# PRIVATE = 0
# PUBLIC = 1

# Public Schools table
aggPublic <- count(publicSchools, isPublicPrivate, isReported)
# aggPublic
totPublicSchools <- sum(aggPublic$n)
ratioPublic <- round(aggPublic$n/totPublicSchools,2)
aggPublicDF <- data.frame(aggPublic,ratioPublic)
aggPublicDF
barplot(table(publicSchools$isReported),
main="Public Schools Reported",
xlab="Reported",
ylab="Number of Schools",
border="orange",
col="steelblue",
density=10
)

2. What proportion of private schools reported vaccination data?

# total schools split
table(allSchools$pubpriv) 
## 
## PRIVATE  PUBLIC 
##    1649    5732
# PRIVATE = 0
# PUBLIC = 1

# 1649/7381 ( ~22%)

# Private Schools table
aggPrivate <- count(privateSchools, isPublicPrivate, isReported)
# aggPrivate
totPrivateSchools <- sum(aggPrivate$n)
ratioPrivate <- round(aggPrivate$n/totPrivateSchools,2)
aggPrivateDF <- data.frame(aggPrivate,ratioPrivate)
aggPrivateDF
barplot(table(privateSchools$isReported),
main="Private Schools Reported",
xlab="Reported",
ylab="Schools",
border="orange",
col="steelblue",
density=10
)

usVaccines.RData

3. Have U.S. vaccinations rates been stable over time?

4. Are there any notable patterns in U.S. vaccinations rates over time?

Please find more details from the R code below,

# Import data
load("~/Documents/R/IST772/usVaccines.RData")
str(usVaccines)
##  Time-Series [1:38, 1:5] from 1980 to 2017: NA NA NA NA NA NA NA NA NA NA ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : NULL
##   ..$ : chr [1:5] "DTP1" "HepB_BD" "Pol3" "Hib3" ...
usVaccinesTS <- usVaccines
ts.plot(usVaccinesTS)

# View(usVaccinesTS)
EnsurePackage("mice") # missing data 
EnsurePackage("visdat") # missing data 
## Loading required package: visdat
usVaccinesDF <- data.frame(usVaccinesTS)
str(usVaccinesDF)
## 'data.frame':    38 obs. of  5 variables:
##  $ DTP1   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ HepB_BD: int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Pol3   : int  95 96 97 97 97 96 97 24 97 58 ...
##  $ Hib3   : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ MCV1   : int  86 97 97 98 98 97 97 82 98 87 ...
# View(usVaccinesDF)

# missing data 
md.pattern(usVaccinesDF, plot=FALSE)
##    Pol3 MCV1 Hib3 DTP1 HepB_BD   
## 16    1    1    1    1       1  0
## 2     1    1    1    1       0  1
## 5     1    1    1    0       0  2
## 15    1    1    0    0       0  3
##       0    0   15   20      22 57
# missing data visualization
vis_miss(usVaccinesDF)

EnsurePackage("changepoint")
## Loading required package: changepoint
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Successfully loaded changepoint package version 2.2.2
##  NOTE: Predefined penalty values changed in version 2.2.  Previous penalty values with a postfix 1 i.e. SIC1 are now without i.e. SIC and previous penalties without a postfix i.e. SIC are now with a postfix 0 i.e. SIC0. See NEWS and help files for further details.
EnsurePackage("tseries")
## Loading required package: tseries
EnsurePackage("timeSeries")
## Loading required package: timeSeries
## Loading required package: timeDate
## 
## Attaching package: 'timeDate'
## The following objects are masked from 'package:dlookr':
## 
##     kurtosis, skewness
## 
## Attaching package: 'timeSeries'
## The following object is masked from 'package:zoo':
## 
##     time<-
class(usVaccinesTS)
## [1] "mts"    "ts"     "matrix"
usVaccinesTSnew <- removeNA(usVaccinesTS)
# usVaccinesTSnew

# start(usVaccinesTSnew)
# end(usVaccinesTSnew)
# frequency(usVaccinesTSnew)
summary(usVaccinesTSnew)
##       DTP1          HepB_BD           Pol3            Hib3      
##  Min.   :97.00   Min.   :21.00   Min.   :90.00   Min.   :84.00  
##  1st Qu.:98.00   1st Qu.:50.00   1st Qu.:92.75   1st Qu.:93.00  
##  Median :98.00   Median :62.00   Median :93.00   Median :93.00  
##  Mean   :98.19   Mean   :57.94   Mean   :92.88   Mean   :92.25  
##  3rd Qu.:99.00   3rd Qu.:69.75   3rd Qu.:94.00   3rd Qu.:93.25  
##  Max.   :99.00   Max.   :74.00   Max.   :94.00   Max.   :94.00  
##       MCV1      
##  Min.   :90.00  
##  1st Qu.:91.75  
##  Median :92.00  
##  Mean   :91.75  
##  3rd Qu.:92.00  
##  Max.   :93.00
plot.ts(usVaccinesTSnew)

dusVaccines <- diff(usVaccinesTSnew)
# dusVaccines

dusMI <- dusVaccines[,2]
dSMIcp <- cpt.var(dusMI)
summary(dSMIcp)
## Created Using changepoint version 2.2.2 
## Changepoint type      : Change in variance 
## Method of analysis    : AMOC 
## Test Statistic  : Normal 
## Type of penalty       : MBIC with value, 8.124151 
## Minimum Segment Length : 2 
## Maximum no. of cpts   : 1 
## Changepoint Locations :
plot(dSMIcp,cpt.col="red",cpt.width=5)

ts.plot(usVaccinesTSnew)

plot(usVaccinesTSnew,col="blue",main="VaccinesTSnew undifferential plot")

plot(dusVaccines,col="orange",main="VaccinesTSnew differential plot")

EnsurePackage("bcp")
## Loading required package: bcp
## Loading required package: grid
# Bayesian Change point analysis
bAPcpm <- bcp(as.vector(usVaccinesTSnew))
plot(bAPcpm)

# summary(bAPcpm)

reportSample12.RData

# reportSample
load("~/Documents/R/IST772/reportSample12.RData")
str(reportSample)
## 'data.frame':    698 obs. of  13 variables:
##  $ code       : num  6019103 6975866 6116404 6120620 6004691 ...
##  $ name       : Factor w/ 5890 levels "A. E. ARNOLD ELEMENTARY",..: 4578 4991 426 5423 1370 5344 5026 3075 3591 4249 ...
##  $ pubpriv    : Factor w/ 2 levels "PRIVATE","PUBLIC": 2 1 2 2 2 2 1 2 2 2 ...
##  $ enrollment : num  32 22 137 78 127 83 27 33 18 28 ...
##  $ allvaccs   : num  90.6 100 93.4 96.2 99.2 ...
##  $ conditional: num  9.38 0 1.46 0 0 ...
##  $ medical    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ religious  : num  0 0 5.109 3.846 0.787 ...
##  $ dptMiss    : num  3.125 0 5.839 3.846 0.787 ...
##  $ polMiss    : num  3.125 0 5.839 3.846 0.787 ...
##  $ mmrMiss    : num  9.375 0 6.569 3.846 0.787 ...
##  $ hepMiss    : num  0 0 5.109 3.846 0.787 ...
##  $ varMiss    : num  0 0 5.109 3.846 0.787 ...
# head(reportSample)
# View(reportSample)
dim(reportSample)
## [1] 698  13
# summary(reportSample)
EnsurePackage("tidyverse")

#data distribution
# violin plot with median points

reportSample %>%
  pivot_longer(cols=-c(code,name,pubpriv), names_to="variable", values_to="value", values_drop_na = TRUE) %>% 
  ggplot(aes(x=variable, y=value)) + geom_violin(trim=FALSE, fill='steelblue', color="orange") + 
  facet_wrap( ~ variable, scales="free") + 
  stat_summary(fun=mean, geom="point", shape=23, size=4,color="red")  + 
  theme_minimal()

EnsurePackage("dlookr") # outlier analysis
# # #outlier analysis
plot_outlier(reportSample)

str(allSchools)
## 'data.frame':    7381 obs. of  3 variables:
##  $ name    : chr  "AGUA DULCE ELEMENTARY" "MEADOWLARK ELEMENTARY" "CALIFORNIA SCHOOL FOR THE DEAF-FREMONT" "HIDDEN VALLEY ELEMENTARY" ...
##  $ pubpriv : chr  "PUBLIC" "PUBLIC" "PUBLIC" "PUBLIC" ...
##  $ reported: chr  "Y" "Y" "Y" "Y" ...
str(allSchoolsDF)
## 'data.frame':    7381 obs. of  4 variables:
##  $ schoolName     : chr  "AGUA DULCE ELEMENTARY" "MEADOWLARK ELEMENTARY" "CALIFORNIA SCHOOL FOR THE DEAF-FREMONT" "HIDDEN VALLEY ELEMENTARY" ...
##  $ isPublicPrivate: num  1 1 1 1 1 1 1 1 1 1 ...
##  $ isReported     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ value          : num  1 1 1 1 1 1 1 1 1 1 ...
str(reportSample)
## 'data.frame':    698 obs. of  13 variables:
##  $ code       : num  6019103 6975866 6116404 6120620 6004691 ...
##  $ name       : Factor w/ 5890 levels "A. E. ARNOLD ELEMENTARY",..: 4578 4991 426 5423 1370 5344 5026 3075 3591 4249 ...
##  $ pubpriv    : Factor w/ 2 levels "PRIVATE","PUBLIC": 2 1 2 2 2 2 1 2 2 2 ...
##  $ enrollment : num  32 22 137 78 127 83 27 33 18 28 ...
##  $ allvaccs   : num  90.6 100 93.4 96.2 99.2 ...
##  $ conditional: num  9.38 0 1.46 0 0 ...
##  $ medical    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ religious  : num  0 0 5.109 3.846 0.787 ...
##  $ dptMiss    : num  3.125 0 5.839 3.846 0.787 ...
##  $ polMiss    : num  3.125 0 5.839 3.846 0.787 ...
##  $ mmrMiss    : num  9.375 0 6.569 3.846 0.787 ...
##  $ hepMiss    : num  0 0 5.109 3.846 0.787 ...
##  $ varMiss    : num  0 0 5.109 3.846 0.787 ...
# Pre-Processing

# 1. Add new column "isPublicPrivate" with boolean value as 0 or 1
reportSampleDF <- data.frame(reportSample
                           ,as.numeric(as.factor(reportSample$pubpriv))-1
                           ,val=1,stringsAsFactors=FALSE)
reportSampleDF$pubpriv=as.character(reportSampleDF$pubpriv)
reportSampleDF$name=as.character(reportSampleDF$name)
names(reportSampleDF)[14] <- "isPublicPrivate"
str(reportSampleDF)
## 'data.frame':    698 obs. of  15 variables:
##  $ code           : num  6019103 6975866 6116404 6120620 6004691 ...
##  $ name           : chr  "SELMA AVENUE ELEMENTARY" "ST. PIUS X" "BENJAMIN FRANKLIN ELEMENTARY" "UNIVERSITY PREPARATION SCHOOL AT CSU CHANNEL ISLANDS" ...
##  $ pubpriv        : chr  "PUBLIC" "PRIVATE" "PUBLIC" "PUBLIC" ...
##  $ enrollment     : num  32 22 137 78 127 83 27 33 18 28 ...
##  $ allvaccs       : num  90.6 100 93.4 96.2 99.2 ...
##  $ conditional    : num  9.38 0 1.46 0 0 ...
##  $ medical        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ religious      : num  0 0 5.109 3.846 0.787 ...
##  $ dptMiss        : num  3.125 0 5.839 3.846 0.787 ...
##  $ polMiss        : num  3.125 0 5.839 3.846 0.787 ...
##  $ mmrMiss        : num  9.375 0 6.569 3.846 0.787 ...
##  $ hepMiss        : num  0 0 5.109 3.846 0.787 ...
##  $ varMiss        : num  0 0 5.109 3.846 0.787 ...
##  $ isPublicPrivate: num  1 0 1 1 1 1 0 1 1 1 ...
##  $ val            : num  1 1 1 1 1 1 1 1 1 1 ...

Public vs. Private School Comparisons:

5. Was there any credible difference in overall reporting proportions between public and private schools?

Please find more details from the R code below,

# table(reportSampleDF$pubpriv)

barplot(table(reportSampleDF$pubpriv),
main="Public/Private Schools reported",
xlab="School",
ylab="Number of Schools",
border="orange",
col="steelblue",
density=10
)

# Public Schools table
aggSamples <- count(reportSampleDF,isPublicPrivate)
totSamples <- sum(aggSamples$n)
ratioPublic <- round(aggSamples$n/totSamples,2)
aggSamplesDF <- data.frame(aggSamples,ratioPublic)
aggSamplesDF
#box plot to compare the tensions
boxplot(allvaccs~pubpriv, data=reportSampleDF,
       border="orange", 
       col="steelblue",
       freq=FALSE,
       xlab="",
       las=1, 
       breaks=10,
       # notch = TRUE,
       horizontal = FALSE ,main=" vaccination rates (allvaccs) between public and private schools")

Public vs. Private School Comparisons:

6. Compare overall vaccination rates (allvaccs) between public and private schools. Are there any credible differences?

Please find more details from the R code below,

#box plot to compare the tensions
boxplot(allvaccs~pubpriv, data=reportSampleDF,
       border="orange", 
       col="steelblue",
       freq=FALSE,
       xlab="",
       las=1, 
       breaks=10,
       # notch = TRUE,
       horizontal = FALSE ,main=" vaccination rates (allvaccs) between public and private schools")

EnsurePackage("zoo")
reportSampleDF$allvaccs <- na.aggregate(reportSampleDF$allvaccs)
sum(is.na(reportSampleDF$allvaccs))
## [1] 0
# Confidence Interval Test
t.test(allvaccs~pubpriv, data=reportSampleDF)
## 
##  Welch Two Sample t-test
## 
## data:  allvaccs by pubpriv
## t = -3.7023, df = 179.7, p-value = 0.000284
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -8.939025 -2.723307
## sample estimates:
## mean in group PRIVATE  mean in group PUBLIC 
##              84.45064              90.28181
# Bayesian distribution
EnsurePackage("BEST")
## Loading required package: BEST
## Loading required package: HDInterval
bayesAllvaccs <- BESTmcmc(reportSampleDF$allvaccs[reportSampleDF$pubpriv=="PUBLIC"]
                          ,reportSampleDF$allvaccs[reportSampleDF$pubpriv=="PRIVATE"]) 
## Waiting for parallel processing to complete...
## done.
bayesAllvaccs
diffBest <- bayesAllvaccs$mu1 - bayesAllvaccs$mu2
# summary(diffBest)
quantile(diffBest, c(0.025, 0.5, 0.975))
##      2.5%       50%     97.5% 
## 0.8043082 3.2188500 5.8351416
#Bayesian probability distribution of the differences between two means
plot(bayesAllvaccs)
abline(v=quantile(diffBest,0.025),col="blue")  # low end HDI differences in proportion
abline(v=quantile(diffBest,0.975),col="green") # high end HDI differences in proportion
abline(v=quantile(diffBest,0.50),col="red") # median  in proportion

plotAll(bayesAllvaccs)

Public vs. Private School Comparisons:

7. Compare medical exemptions between public and private schools. Are there any credible differences?

Please find more details from the R code below,

#box plot to compare the tensions
boxplot(medical~pubpriv, data=reportSampleDF,
       border="orange", 
       col="steelblue",
       freq=FALSE,
       xlab="",
       las=1, 
       # breaks=10,
       # notch = TRUE,
       horizontal = FALSE ,main=" medical excemption between public and private schools")

EnsurePackage("zoo")
reportSampleDF$medical <- na.aggregate(reportSampleDF$medical)
sum(is.na(reportSampleDF$medical))
## [1] 0
# Confidence Interval Test
t.test(medical~pubpriv, data=reportSampleDF)
## 
##  Welch Two Sample t-test
## 
## data:  medical by pubpriv
## t = 1.3321, df = 159.92, p-value = 0.1847
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.0870064  0.4475860
## sample estimates:
## mean in group PRIVATE  mean in group PUBLIC 
##             0.3169112             0.1366214
# Bayesian distribution
EnsurePackage("BEST")
bayesmedical <- BESTmcmc(reportSampleDF$medical[reportSampleDF$pubpriv=="PUBLIC"]
                         ,reportSampleDF$medical[reportSampleDF$pubpriv=="PRIVATE"]) 
## Waiting for parallel processing to complete...done.
bayesmedical
bayesmedicalDiffBest <- bayesmedical$mu1 - bayesmedical$mu2
# summary(diffBest)
quantile(bayesmedicalDiffBest, c(0.025, 0.5, 0.975))
##          2.5%           50%         97.5% 
## -1.190542e-04  1.793326e-07  1.185549e-04
#Bayesian probability distribution of the differences between two means
plot(bayesmedical)
abline(v=quantile(bayesmedicalDiffBest,0.025),col="blue")  # low end HDI differences in proportion
abline(v=quantile(bayesmedicalDiffBest,0.975),col="green") # high end HDI differences in proportion
abline(v=quantile(bayesmedicalDiffBest,0.50),col="red") # median  in proportion

plotAll(bayesmedical)

Public vs. Private School Comparisons:

8. Compare religious/belief exemptions between public and private schools. Are there any credible differences?

Please find more details from the R code below,

#box plot to compare the tensions
boxplot(religious~pubpriv, data=reportSampleDF,
       border="orange", 
       col="steelblue",
       freq=FALSE,
       xlab="",
       las=1, 
       breaks=10,
       # notch = TRUE,
       horizontal = FALSE ,main=" religious belief between public and private schools")

EnsurePackage("zoo")
reportSampleDF$religious <- na.aggregate(reportSampleDF$religious)
sum(is.na(reportSampleDF$religious))
## [1] 0
# Confidence Interval Test
t.test(religious~pubpriv, data=reportSampleDF)
## 
##  Welch Two Sample t-test
## 
## data:  religious by pubpriv
## t = 1.9052, df = 178.55, p-value = 0.05836
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.09940198  5.66035299
## sample estimates:
## mean in group PRIVATE  mean in group PUBLIC 
##              6.623709              3.843233
# Bayesian distribution
EnsurePackage("BEST")
bayesreligious <- BESTmcmc(reportSampleDF$religious[reportSampleDF$pubpriv=="PUBLIC"]
                           ,reportSampleDF$religious[reportSampleDF$pubpriv=="PRIVATE"]) 
## Waiting for parallel processing to complete...done.
bayesreligious
bayesreligiousDiffBest <- bayesreligious$mu1 - bayesreligious$mu2
# summary(diffBest)
quantile(bayesreligiousDiffBest, c(0.025, 0.5, 0.975))
##      2.5%       50%     97.5% 
## 0.4322757 0.5843983 0.7481946
#Bayesian probability distribution of the differences between two means
plot(bayesreligious)
abline(v=quantile(bayesreligiousDiffBest,0.025),col="blue")  # low end HDI differences in proportion
abline(v=quantile(bayesreligiousDiffBest,0.975),col="green") # high end HDI differences in proportion
abline(v=quantile(bayesreligiousDiffBest,0.50),col="red") # median  in proportion

plotAll(bayesreligious)

summary(reportSampleDF)
##       code             name             pubpriv            enrollment    
##  Min.   : 100362   Length:698         Length:698         Min.   : 10.00  
##  1st Qu.:6015501   Class :character   Class :character   1st Qu.: 42.25  
##  Median :6043137   Mode  :character   Mode  :character   Median : 75.00  
##  Mean   :5588410                                         Mean   : 76.11  
##  3rd Qu.:6116305                                         3rd Qu.:105.00  
##  Max.   :7103161                                         Max.   :221.00  
##                                                                          
##     allvaccs       conditional        medical          religious      
##  Min.   :  8.00   Min.   : 0.000   Min.   : 0.0000   Min.   :  0.000  
##  1st Qu.: 86.17   1st Qu.: 0.000   1st Qu.: 0.0000   1st Qu.:  0.000  
##  Median : 93.33   Median : 3.030   Median : 0.0000   Median :  1.087  
##  Mean   : 89.01   Mean   : 6.931   Mean   : 0.1759   Mean   :  4.449  
##  3rd Qu.: 97.46   3rd Qu.: 8.677   3rd Qu.: 0.0000   3rd Qu.:  4.353  
##  Max.   :100.00   Max.   :81.818   Max.   :14.2857   Max.   :162.500  
##                                                                       
##     dptMiss          polMiss          mmrMiss          hepMiss      
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 1.699   1st Qu.: 1.493   1st Qu.: 1.429   1st Qu.: 0.722  
##  Median : 4.918   Median : 4.587   Median : 5.000   Median : 3.073  
##  Mean   : 8.638   Mean   : 8.229   Mean   : 8.793   Mean   : 5.909  
##  3rd Qu.:10.960   3rd Qu.:10.526   3rd Qu.:11.111   3rd Qu.: 7.767  
##  Max.   :81.250   Max.   :81.250   Max.   :81.250   Max.   :79.070  
##                   NA's   :1        NA's   :1                        
##     varMiss       isPublicPrivate       val   
##  Min.   : 0.000   Min.   :0.0000   Min.   :1  
##  1st Qu.: 0.000   1st Qu.:1.0000   1st Qu.:1  
##  Median : 2.485   Median :1.0000   Median :1  
##  Mean   : 5.371   Mean   :0.7822   Mean   :1  
##  3rd Qu.: 6.667   3rd Qu.:1.0000   3rd Qu.:1  
##  Max.   :79.070   Max.   :1.0000   Max.   :1  
## 

Predictive Analysis:

9. Is it possible to predict whether a school is public or private based on conditional, medical, and religious percentages? If so, what are the specifics?

Please find more details from the R code below,

# dim(reportSampleDF)
# Create glm model
options(scipen=999)  # turn-off scientific notation like 1e+48

school.glm <- glm(isPublicPrivate ~ conditional + medical + religious ,data=reportSampleDF,family = binomial(link = "logit")) 
summary(school.glm)
## 
## Call:
## glm(formula = isPublicPrivate ~ conditional + medical + religious, 
##     family = binomial(link = "logit"), data = reportSampleDF)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8860   0.6083   0.6360   0.6857   1.4426  
## 
## Coefficients:
##              Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)  1.593445   0.122455  13.012 < 0.0000000000000002 ***
## conditional -0.025150   0.007917  -3.177              0.00149 ** 
## medical     -0.188283   0.091933  -2.048              0.04056 *  
## religious   -0.017045   0.006992  -2.438              0.01477 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 731.59  on 697  degrees of freedom
## Residual deviance: 712.46  on 694  degrees of freedom
## AIC: 720.46
## 
## Number of Fisher Scoring iterations: 4
hist(residuals(school.glm))

# Create lm model
options(scipen=999)  # turn-off scientific notation like 1e+48

school.lm <- lm(isPublicPrivate ~ conditional + medical + religious ,data=reportSampleDF)
summary(school.lm)
## 
## Call:
## lm(formula = isPublicPrivate ~ conditional + medical + religious, 
##     data = reportSampleDF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.8385  0.1615  0.1809  0.2143  0.6133 
## 
## Coefficients:
##              Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  0.838508   0.019762  42.431 < 0.0000000000000002 ***
## conditional -0.004863   0.001479  -3.289              0.00106 ** 
## medical     -0.039131   0.017109  -2.287              0.02249 *  
## religious   -0.003525   0.001306  -2.700              0.00711 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4074 on 694 degrees of freedom
## Multiple R-squared:  0.03138,    Adjusted R-squared:  0.02719 
## F-statistic: 7.494 on 3 and 694 DF,  p-value: 0.00006093
EnsurePackage("VIF")
## Loading required package: VIF
EnsurePackage("car")
## Loading required package: car
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:VIF':
## 
##     vif
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:purrr':
## 
##     some
vif(school.lm)
## conditional     medical   religious 
##    1.001352    1.000499    1.000863
vif(school.glm)
## conditional     medical   religious 
##    1.004026    1.001795    1.002371
EnsurePackage("MCMCpack")
## Loading required package: MCMCpack
## Loading required package: coda
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## ##
## ## Markov Chain Monte Carlo Package (MCMCpack)
## ## Copyright (C) 2003-2020 Andrew D. Martin, Kevin M. Quinn, and Jong Hee Park
## ##
## ## Support provided by the U.S. National Science Foundation
## ## (Grants SES-0350646 and SES-0350613)
## ##
#histogram on residuals chile.glm
hist(residuals(school.glm))

#Chi-Square analysis on logistic regression
anova(school.glm, test="Chisq")
# Convert the log odds for the coefficient on the predictor into regular odds
exp(coef(school.glm))
## (Intercept) conditional     medical   religious 
##   4.9206691   0.9751638   0.8283803   0.9830997
#confusion matrix
table(round(predict(school.glm,type="response")),reportSampleDF$isPublicPrivate)
##    
##       0   1
##   0   6   4
##   1 146 542
EnsurePackage("MASS") # AIC
stepOut <- stepAIC(school.glm)
## Start:  AIC=720.46
## isPublicPrivate ~ conditional + medical + religious
## 
##               Df Deviance    AIC
## <none>             712.46 720.46
## - medical      1   716.82 722.82
## - religious    1   718.55 724.55
## - conditional  1   722.10 728.10
stepOut$anova
#bayesian estimation of logistic regression
options(scipen=999)  # turn-off scientific notation like 1e+48
EnsurePackage("BayesFactor")
## Loading required package: BayesFactor
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## ************
## Welcome to BayesFactor 0.9.12-4.2. If you have questions, please contact Richard Morey (richarddmorey@gmail.com).
## 
## Type BFManual() to open the manual.
## ************
schoolMCMCout <- lmBF(isPublicPrivate ~ conditional + medical + religious ,data=reportSampleDF, posterior=TRUE, iterations=10000)
summary(schoolMCMCout)
## 
## Iterations = 1:10000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##                  Mean       SD   Naive SE Time-series SE
## mu           0.782291 0.015447 0.00015447     0.00015447
## conditional -0.004708 0.001450 0.00001450     0.00001450
## medical     -0.037980 0.016760 0.00016760     0.00016760
## religious   -0.003414 0.001297 0.00001297     0.00001320
## sig2         0.165941 0.008886 0.00008886     0.00009047
## g            0.077273 0.116194 0.00116194     0.00116194
## 
## 2. Quantiles for each variable:
## 
##                  2.5%       25%       50%       75%      97.5%
## mu           0.752232  0.771905  0.782344  0.792765  0.8128309
## conditional -0.007581 -0.005678 -0.004701 -0.003738 -0.0018760
## medical     -0.070183 -0.049255 -0.038062 -0.026418 -0.0050365
## religious   -0.005981 -0.004269 -0.003425 -0.002545 -0.0008822
## sig2         0.149463  0.159814  0.165602  0.171783  0.1842068
## g            0.014333  0.029616  0.047666  0.083318  0.3227107
# schoolMCMCout
rsqList <- 1 - (schoolMCMCout[,"sig2"] / var(reportSampleDF$isPublicPrivate))
MeanrsqList <- mean(rsqList)
MeanrsqList
## [1] 0.02724264
quantile(rsqList,c(0.025))
##        2.5% 
## -0.07983559
quantile(rsqList,c(0.975))
##     97.5% 
## 0.1238336
# Bayes factor analysis
schoolMCMCoutBF <- lmBF(isPublicPrivate ~ conditional + medical + religious ,data=reportSampleDF)
summary(schoolMCMCoutBF)
## Bayes factor analysis
## --------------
## [1] conditional + medical + religious : 80.08021 ±0.01%
## 
## Against denominator:
##   Intercept only 
## ---
## Bayes factor type: BFlinearModel, JZS
# histogram of 95% HDI mean differences on conditional | overlaps with 0
hist(schoolMCMCout[,"conditional"],col="#CBB43D")
abline(v=quantile(schoolMCMCout[,"conditional"],c(0.025)),col="blue")
abline(v=quantile(schoolMCMCout[,"conditional"],c(0.975)),col="green")

# histogram of 95% HDI mean differences on medical | overlaps with 0
hist(schoolMCMCout[,"medical"],col="#4DAFD4")
abline(v=quantile(schoolMCMCout[,"medical"],c(0.025)),col="blue")
abline(v=quantile(schoolMCMCout[,"medical"],c(0.975)),col="green")

# histogram of 95% HDI mean differences on religious | overlaps with 0
hist(schoolMCMCout[,"religious"],col="pink")
abline(v=quantile(schoolMCMCout[,"religious"],c(0.025)),col="blue")
abline(v=quantile(schoolMCMCout[,"religious"],c(0.975)),col="green")

Predictive Analysis:

10. Is it possible to predict conditional percentage, based on the percentages of specific vaccines that are missing? If so, what are the specifics

Please find more details from the R code below,

# Pre-Processing
EnsurePackage("mice") # missing data 
EnsurePackage("visdat") # missing data 
EnsurePackage("zoo")

summary(reportSampleDF)
##       code             name             pubpriv            enrollment    
##  Min.   : 100362   Length:698         Length:698         Min.   : 10.00  
##  1st Qu.:6015501   Class :character   Class :character   1st Qu.: 42.25  
##  Median :6043137   Mode  :character   Mode  :character   Median : 75.00  
##  Mean   :5588410                                         Mean   : 76.11  
##  3rd Qu.:6116305                                         3rd Qu.:105.00  
##  Max.   :7103161                                         Max.   :221.00  
##                                                                          
##     allvaccs       conditional        medical          religious      
##  Min.   :  8.00   Min.   : 0.000   Min.   : 0.0000   Min.   :  0.000  
##  1st Qu.: 86.17   1st Qu.: 0.000   1st Qu.: 0.0000   1st Qu.:  0.000  
##  Median : 93.33   Median : 3.030   Median : 0.0000   Median :  1.087  
##  Mean   : 89.01   Mean   : 6.931   Mean   : 0.1759   Mean   :  4.449  
##  3rd Qu.: 97.46   3rd Qu.: 8.677   3rd Qu.: 0.0000   3rd Qu.:  4.353  
##  Max.   :100.00   Max.   :81.818   Max.   :14.2857   Max.   :162.500  
##                                                                       
##     dptMiss          polMiss          mmrMiss          hepMiss      
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 1.699   1st Qu.: 1.493   1st Qu.: 1.429   1st Qu.: 0.722  
##  Median : 4.918   Median : 4.587   Median : 5.000   Median : 3.073  
##  Mean   : 8.638   Mean   : 8.229   Mean   : 8.793   Mean   : 5.909  
##  3rd Qu.:10.960   3rd Qu.:10.526   3rd Qu.:11.111   3rd Qu.: 7.767  
##  Max.   :81.250   Max.   :81.250   Max.   :81.250   Max.   :79.070  
##                   NA's   :1        NA's   :1                        
##     varMiss       isPublicPrivate       val   
##  Min.   : 0.000   Min.   :0.0000   Min.   :1  
##  1st Qu.: 0.000   1st Qu.:1.0000   1st Qu.:1  
##  Median : 2.485   Median :1.0000   Median :1  
##  Mean   : 5.371   Mean   :0.7822   Mean   :1  
##  3rd Qu.: 6.667   3rd Qu.:1.0000   3rd Qu.:1  
##  Max.   :79.070   Max.   :1.0000   Max.   :1  
## 
# missing data 
md.pattern(reportSampleDF, plot=FALSE)
##     code name pubpriv enrollment allvaccs conditional medical religious dptMiss
## 696    1    1       1          1        1           1       1         1       1
## 1      1    1       1          1        1           1       1         1       1
## 1      1    1       1          1        1           1       1         1       1
##        0    0       0          0        0           0       0         0       0
##     hepMiss varMiss isPublicPrivate val polMiss mmrMiss  
## 696       1       1               1   1       1       1 0
## 1         1       1               1   1       1       0 1
## 1         1       1               1   1       0       1 1
##           0       0               0   0       1       1 2
# missing data visualization
vis_miss(reportSampleDF)

#replace missing values
reportSampleDF$polMiss <- na.aggregate(reportSampleDF$polMiss)
sum(is.na(reportSampleDF$polMiss))
## [1] 0
reportSampleDF$mmrMiss <- na.aggregate(reportSampleDF$mmrMiss)
sum(is.na(reportSampleDF$mmrMiss))
## [1] 0
options(scipen=999)  # turn-off scientific notation like 1e+48
# summary(reportSampleDF)
# str(reportSampleDF)
# Crete lm model
conditional.lm <- lm(conditional ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF) 
summary(conditional.lm)
## 
## Call:
## lm(formula = conditional ~ dptMiss + polMiss + mmrMiss + varMiss, 
##     data = reportSampleDF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -18.797  -1.952  -1.042   0.553  42.043 
## 
## Coefficients:
##             Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)  1.04211    0.27139    3.84             0.000134 ***
## dptMiss      0.65200    0.07326    8.90 < 0.0000000000000002 ***
## polMiss      0.08391    0.07992    1.05             0.294117    
## mmrMiss      0.71589    0.04923   14.54 < 0.0000000000000002 ***
## varMiss     -1.25271    0.04485  -27.93 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.565 on 693 degrees of freedom
## Multiple R-squared:  0.7176, Adjusted R-squared:  0.716 
## F-statistic: 440.2 on 4 and 693 DF,  p-value: < 0.00000000000000022
options(scipen=999)  # turn-off scientific notation like 1e+48
# summary(reportSampleDF)
# str(reportSampleDF)

# Crete glm model

# conditional.glm <- glm(conditional ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF,family = binomial(link = "logit")) 

## Error in eval(family$initialize) : y values must be 0 <= y <= 1
EnsurePackage("VIF")
EnsurePackage("car")
vif(conditional.lm)
##   dptMiss   polMiss   mmrMiss   varMiss 
## 14.262805 16.377165  7.110362  3.513100
# vif(conditional.glm)
EnsurePackage("MCMCpack")
#histogram on residuals chile.glm
hist(residuals(conditional.lm))

#Chi-Square analysis on logistic regression
anova(conditional.lm, test="Chisq")
# Convert the log odds for the coefficient on the predictor into regular odds
exp(coef(conditional.lm))
## (Intercept)     dptMiss     polMiss     mmrMiss     varMiss 
##    2.835203    1.919370    1.087528    2.046004    0.285728
EnsurePackage("MASS") # AIC
stepOut_conditional <- stepAIC(conditional.lm)
## Start:  AIC=2401.29
## conditional ~ dptMiss + polMiss + mmrMiss + varMiss
## 
##           Df Sum of Sq   RSS    AIC
## - polMiss  1      34.1 21498 2400.4
## <none>                 21464 2401.3
## - dptMiss  1    2453.3 23917 2474.8
## - mmrMiss  1    6549.0 28013 2585.2
## - varMiss  1   24161.2 45625 2925.6
## 
## Step:  AIC=2400.4
## conditional ~ dptMiss + mmrMiss + varMiss
## 
##           Df Sum of Sq   RSS    AIC
## <none>                 21498 2400.4
## - dptMiss  1    6058.6 27557 2571.7
## - mmrMiss  1    7955.0 29453 2618.2
## - varMiss  1   24614.9 46113 2931.1
stepOut_conditional$anova
# Bayesian regression with Posterior iterations
options(scipen=999)  # turn-off scientific notation like 1e+48
EnsurePackage("BayesFactor")

MCMCout_conditional <- lmBF(conditional ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF, posterior=TRUE, iterations=10000)
summary(MCMCout_conditional)
## 
## Iterations = 1:10000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##             Mean      SD  Naive SE Time-series SE
## mu       6.92979 0.21334 0.0021334      0.0021334
## dptMiss  0.65122 0.07321 0.0007321      0.0007321
## polMiss  0.08319 0.08035 0.0008035      0.0008035
## mmrMiss  0.71320 0.04949 0.0004949      0.0004825
## varMiss -1.24892 0.04463 0.0004463      0.0004463
## sig2    31.12577 1.69441 0.0169441      0.0173560
## g        0.88361 1.37190 0.0137190      0.0137190
## 
## 2. Quantiles for each variable:
## 
##             2.5%      25%      50%     75%   97.5%
## mu       6.51638  6.78795  6.93113  7.0726  7.3419
## dptMiss  0.50857  0.60160  0.65080  0.7002  0.7941
## polMiss -0.07649  0.02818  0.08234  0.1370  0.2420
## mmrMiss  0.61480  0.68008  0.71371  0.7462  0.8095
## varMiss -1.33578 -1.27952 -1.24881 -1.2180 -1.1639
## sig2    27.94425 29.95871 31.06933 32.2215 34.5952
## g        0.20309  0.39927  0.59911  0.9847  3.1671
rsqList <- 1 - (MCMCout_conditional[,"sig2"] / var(reportSampleDF$conditional))
MeanrsqList <- mean(rsqList)
MeanrsqList
## [1] 0.7145511
quantile(rsqList,c(0.025))
##      2.5% 
## 0.6827336
quantile(rsqList,c(0.975))
##     97.5% 
## 0.7437283
# Bayes factor analysis
conditionalMCMCoutBF <- lmBF(conditional ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF)
summary(conditionalMCMCoutBF)
## Bayes factor analysis
## --------------
## [1] dptMiss + polMiss + mmrMiss + varMiss : 44389270741483073102670811534531134153720679431604137680479811655382002050441444874667850084091485973521762455768514182868334091493893173260895732465272807068522715246876676770274213888 ±0%
## 
## Against denominator:
##   Intercept only 
## ---
## Bayes factor type: BFlinearModel, JZS
# histogram of 95% HDI mean differences on dptMiss | overlaps with 0
hist(MCMCout_conditional[,"dptMiss"],col="#CBB43D")
abline(v=quantile(MCMCout_conditional[,"dptMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_conditional[,"dptMiss"],c(0.975)),col="green")

# histogram of 95% HDI mean differences on polMiss | overlaps with 0
hist(MCMCout_conditional[,"polMiss"],col="#4DAFD4")
abline(v=quantile(MCMCout_conditional[,"polMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_conditional[,"polMiss"],c(0.975)),col="green")

# histogram of 95% HDI mean differences on mmrMiss | overlaps with 0
hist(MCMCout_conditional[,"mmrMiss"],col="pink")
abline(v=quantile(MCMCout_conditional[,"mmrMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_conditional[,"mmrMiss"],c(0.975)),col="green")

# histogram of 95% HDI mean differences on varMiss | overlaps with 0
hist(MCMCout_conditional[,"varMiss"],col="orange")
abline(v=quantile(MCMCout_conditional[,"varMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_conditional[,"varMiss"],c(0.975)),col="green")

Predictive Analysis:

11. Is it possible to predict medical percentage, based on the percentages of specific vaccines that are missing? If so, what are the specifics?

Please find more details from the R code below,

options(scipen=999)  # turn-off scientific notation like 1e+48
# summary(reportSampleDF)
# str(reportSampleDF)
# Crete lm model
medical.lm <- lm(medical ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF) 
summary(medical.lm)
## 
## Call:
## lm(formula = medical ~ dptMiss + polMiss + mmrMiss + varMiss, 
##     data = reportSampleDF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.0651 -0.1806 -0.1368 -0.1202 13.9291 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  0.124270   0.043760   2.840  0.00465 **
## dptMiss      0.006927   0.011813   0.586  0.55781   
## polMiss     -0.014374   0.012886  -1.115  0.26504   
## mmrMiss      0.001800   0.007938   0.227  0.82069   
## varMiss      0.017546   0.007232   2.426  0.01552 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8974 on 693 degrees of freedom
## Multiple R-squared:  0.01608,    Adjusted R-squared:  0.0104 
## F-statistic: 2.831 on 4 and 693 DF,  p-value: 0.02392
EnsurePackage("VIF")
EnsurePackage("car")
vif(medical.lm)
##   dptMiss   polMiss   mmrMiss   varMiss 
## 14.262805 16.377165  7.110362  3.513100
EnsurePackage("MASS") # AIC
stepOut_medical <- stepAIC(medical.lm)
## Start:  AIC=-146.17
## medical ~ dptMiss + polMiss + mmrMiss + varMiss
## 
##           Df Sum of Sq    RSS     AIC
## - mmrMiss  1    0.0414 558.11 -148.12
## - dptMiss  1    0.2769 558.34 -147.83
## - polMiss  1    1.0020 559.07 -146.92
## <none>                 558.07 -146.17
## - varMiss  1    4.7397 562.81 -142.27
## 
## Step:  AIC=-148.12
## medical ~ dptMiss + polMiss + varMiss
## 
##           Df Sum of Sq    RSS     AIC
## - dptMiss  1    0.3362 558.44 -149.70
## - polMiss  1    0.9906 559.10 -148.88
## <none>                 558.11 -148.12
## - varMiss  1    4.9939 563.10 -143.90
## 
## Step:  AIC=-149.7
## medical ~ polMiss + varMiss
## 
##           Df Sum of Sq    RSS     AIC
## - polMiss  1    1.0145 559.46 -150.43
## <none>                 558.44 -149.70
## - varMiss  1    5.5927 564.04 -144.75
## 
## Step:  AIC=-150.43
## medical ~ varMiss
## 
##           Df Sum of Sq    RSS     AIC
## <none>                 559.46 -150.43
## - varMiss  1    7.7258 567.18 -142.86
stepOut_medical$anova
EnsurePackage("MCMCpack")
#histogram on residuals medical.lm
hist(residuals(medical.lm))

#Chi-Square analysis on logistic regression
anova(medical.lm, test="Chisq")
# Convert the log odds for the coefficient on the predictor into regular odds
exp(coef(medical.lm))
## (Intercept)     dptMiss     polMiss     mmrMiss     varMiss 
##   1.1323218   1.0069507   0.9857287   1.0018016   1.0177005
#confusion matrix
# table(round(predict(medical.lm,type="response")),reportSampleDF$medical)
options(scipen=999)  # turn-off scientific notation like 1e+48
EnsurePackage("BayesFactor")

MCMCout_medical <- lmBF(medical ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF, posterior=TRUE, iterations=10000)
summary(MCMCout_medical)
## 
## Iterations = 1:10000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##              Mean       SD   Naive SE Time-series SE
## mu       0.176513 0.034189 0.00034189     0.00036380
## dptMiss  0.006635 0.011378 0.00011378     0.00011378
## polMiss -0.013715 0.012598 0.00012598     0.00012598
## mmrMiss  0.001726 0.007681 0.00007681     0.00007681
## varMiss  0.016715 0.007136 0.00007136     0.00007104
## sig2     0.803768 0.043340 0.00043340     0.00043612
## g        0.049418 0.069260 0.00069260     0.00069260
## 
## 2. Quantiles for each variable:
## 
##              2.5%       25%       50%       75%   97.5%
## mu       0.109806  0.153174  0.176351  0.199960 0.24327
## dptMiss -0.015334 -0.001110  0.006665  0.014246 0.02944
## polMiss -0.038650 -0.022041 -0.013680 -0.005383 0.01132
## mmrMiss -0.013355 -0.003397  0.001663  0.006854 0.01689
## varMiss  0.002626  0.011880  0.016714  0.021498 0.03072
## sig2     0.723826  0.773756  0.802510  0.832234 0.89318
## g        0.011025  0.021784  0.033495  0.055239 0.17953
rsqList <- 1 - (MCMCout_medical[,"sig2"] / var(reportSampleDF$medical))
MeanrsqList <- mean(rsqList)
MeanrsqList
## [1] 0.01226551
quantile(rsqList,c(0.025))
##        2.5% 
## -0.09760797
quantile(rsqList,c(0.975))
##     97.5% 
## 0.1105045
# Bayes Factor Analysis
medicalBF <- lmBF(medical ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF)
summary(medicalBF)
## Bayes factor analysis
## --------------
## [1] dptMiss + polMiss + mmrMiss + varMiss : 0.07607594 ±0.01%
## 
## Against denominator:
##   Intercept only 
## ---
## Bayes factor type: BFlinearModel, JZS
# histogram of 95% HDI mean differences on dptMiss | overlaps with 0
hist(MCMCout_medical[,"dptMiss"],col="#CBB43D")
abline(v=quantile(MCMCout_medical[,"dptMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_medical[,"dptMiss"],c(0.975)),col="green")

# histogram of 95% HDI mean differences on polMiss | overlaps with 0
hist(MCMCout_medical[,"polMiss"],col="#4DAFD4")
abline(v=quantile(MCMCout_medical[,"polMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_medical[,"polMiss"],c(0.975)),col="green")

# histogram of 95% HDI mean differences on mmrMiss | overlaps with 0
hist(MCMCout_medical[,"mmrMiss"],col="pink")
abline(v=quantile(MCMCout_medical[,"mmrMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_medical[,"mmrMiss"],c(0.975)),col="green")

# histogram of 95% HDI mean differences on varMiss | overlaps with 0
hist(MCMCout_medical[,"varMiss"],col="orange")
abline(v=quantile(MCMCout_medical[,"varMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_medical[,"varMiss"],c(0.975)),col="green")

Predictive Analyses:

12. Is it possible to predict religious percentage, based on the percentages of specific vaccines that are missing? If so, what are the specifics?

Please find more details from the R code below,

#multiple linear regression
options(scipen=999)  # turn-off scientific notation like 1e+48
# summary(reportSampleDF)
# str(reportSampleDF)
# Crete lm model
religious.lm <- lm(religious ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF) 
summary(religious.lm)
## 
## Call:
## lm(formula = religious ~ dptMiss + polMiss + mmrMiss + varMiss, 
##     data = reportSampleDF)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.639  -0.762   0.282   0.698 104.436 
## 
## Coefficients:
##             Estimate Std. Error t value            Pr(>|t|)    
## (Intercept) -0.24661    0.41567  -0.593              0.5532    
## dptMiss     -0.24685    0.11221  -2.200              0.0281 *  
## polMiss      0.22793    0.12240   1.862              0.0630 .  
## mmrMiss     -0.04702    0.07541  -0.624              0.5331    
## varMiss      0.99891    0.06870  14.541 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.524 on 693 degrees of freedom
## Multiple R-squared:  0.483,  Adjusted R-squared:  0.4801 
## F-statistic: 161.9 on 4 and 693 DF,  p-value: < 0.00000000000000022
EnsurePackage("VIF")
EnsurePackage("car")
vif(religious.lm)
##   dptMiss   polMiss   mmrMiss   varMiss 
## 14.262805 16.377165  7.110362  3.513100
EnsurePackage("MASS") # AIC
stepOut_religious <- stepAIC(religious.lm)
## Start:  AIC=2996.45
## religious ~ dptMiss + polMiss + mmrMiss + varMiss
## 
##           Df Sum of Sq   RSS    AIC
## - mmrMiss  1      28.3 50381 2994.8
## <none>                 50353 2996.5
## - polMiss  1     252.0 50605 2997.9
## - dptMiss  1     351.7 50704 2999.3
## - varMiss  1   15362.7 65715 3180.3
## 
## Step:  AIC=2994.84
## religious ~ dptMiss + polMiss + varMiss
## 
##           Df Sum of Sq   RSS    AIC
## <none>                 50381 2994.8
## - polMiss  1     223.9 50605 2995.9
## - dptMiss  1     410.5 50791 2998.5
## - varMiss  1   15526.3 65907 3180.4
stepOut_religious$anova
EnsurePackage("MCMCpack")
#histogram on residuals religious.lm
hist(residuals(religious.lm))

#Chi-Square analysis on logistic regression
anova(religious.lm, test="Chisq")
# Convert the log odds for the coefficient on the predictor into regular odds
exp(coef(religious.lm))
## (Intercept)     dptMiss     polMiss     mmrMiss     varMiss 
##   0.7814467   0.7812570   1.2560020   0.9540644   2.7153276
#confusion matrix
# table(round(predict(religious.lm,type="response")),reportSampleDF$religious)
options(scipen=999)  # turn-off scientific notation like 1e+48
EnsurePackage("BayesFactor")

MCMCout_religious <- lmBF(religious ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF, posterior=TRUE, iterations=10000)
summary(MCMCout_religious)
## 
## Iterations = 1:10000
## Thinning interval = 1 
## Number of chains = 1 
## Sample size per chain = 10000 
## 
## 1. Empirical mean and standard deviation for each variable,
##    plus standard error of the mean:
## 
##             Mean      SD  Naive SE Time-series SE
## mu       4.44532 0.32036 0.0032036      0.0032036
## dptMiss -0.24300 0.11225 0.0011225      0.0011225
## polMiss  0.22447 0.12251 0.0012251      0.0012251
## mmrMiss -0.04661 0.07621 0.0007621      0.0007621
## varMiss  0.99243 0.06819 0.0006819      0.0006819
## sig2    72.95072 3.87184 0.0387184      0.0393957
## g        0.34720 0.40820 0.0040820      0.0040820
## 
## 2. Quantiles for each variable:
## 
##             2.5%      25%      50%       75%    97.5%
## mu       3.80448  4.23195  4.44384  4.661998  5.07339
## dptMiss -0.46085 -0.31910 -0.24355 -0.167167 -0.02233
## polMiss -0.01387  0.14281  0.22462  0.306061  0.46359
## mmrMiss -0.19657 -0.09847 -0.04732  0.003765  0.10419
## varMiss  0.85783  0.94773  0.99312  1.038333  1.12667
## sig2    65.73168 70.29508 72.82818 75.473907 81.01892
## g        0.07924  0.15516  0.23848  0.389960  1.29311
rsqList <- 1 - (MCMCout_religious[,"sig2"] / var(reportSampleDF$religious))
MeanrsqList <- mean(rsqList)
MeanrsqList
## [1] 0.4779657
quantile(rsqList,c(0.025))
##      2.5% 
## 0.4202298
quantile(rsqList,c(0.975))
##     97.5% 
## 0.5296251
religiousBF <- lmBF(religious ~ dptMiss + polMiss + mmrMiss + varMiss ,data=reportSampleDF)
summary(religiousBF)
## Bayes factor analysis
## --------------
## [1] dptMiss + polMiss + mmrMiss + varMiss : 13924325715845296893236637991759097040796547804482437591555400957505265181385934113503608569856 ±0%
## 
## Against denominator:
##   Intercept only 
## ---
## Bayes factor type: BFlinearModel, JZS
# histogram of 95% HDI mean differences on dptMiss | overlaps with 0
hist(MCMCout_religious[,"dptMiss"],col="#CBB43D")
abline(v=quantile(MCMCout_religious[,"dptMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_religious[,"dptMiss"],c(0.975)),col="green")

# histogram of 95% HDI mean differences on polMiss | overlaps with 0
hist(MCMCout_religious[,"polMiss"],col="#4DAFD4")
abline(v=quantile(MCMCout_religious[,"polMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_religious[,"polMiss"],c(0.975)),col="green")

# histogram of 95% HDI mean differences on mmrMiss | overlaps with 0
hist(MCMCout_religious[,"mmrMiss"],col="pink")
abline(v=quantile(MCMCout_religious[,"mmrMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_religious[,"mmrMiss"],c(0.975)),col="green")

# histogram of 95% HDI mean differences on varMiss | doesnt overlap with 0
hist(MCMCout_religious[,"varMiss"],col="orange")
abline(v=quantile(MCMCout_religious[,"varMiss"],c(0.025)),col="blue")
abline(v=quantile(MCMCout_religious[,"varMiss"],c(0.975)),col="green")

summary(reportSampleDF)
##       code             name             pubpriv            enrollment    
##  Min.   : 100362   Length:698         Length:698         Min.   : 10.00  
##  1st Qu.:6015501   Class :character   Class :character   1st Qu.: 42.25  
##  Median :6043137   Mode  :character   Mode  :character   Median : 75.00  
##  Mean   :5588410                                         Mean   : 76.11  
##  3rd Qu.:6116305                                         3rd Qu.:105.00  
##  Max.   :7103161                                         Max.   :221.00  
##     allvaccs       conditional        medical          religious      
##  Min.   :  8.00   Min.   : 0.000   Min.   : 0.0000   Min.   :  0.000  
##  1st Qu.: 86.17   1st Qu.: 0.000   1st Qu.: 0.0000   1st Qu.:  0.000  
##  Median : 93.33   Median : 3.030   Median : 0.0000   Median :  1.087  
##  Mean   : 89.01   Mean   : 6.931   Mean   : 0.1759   Mean   :  4.449  
##  3rd Qu.: 97.46   3rd Qu.: 8.677   3rd Qu.: 0.0000   3rd Qu.:  4.353  
##  Max.   :100.00   Max.   :81.818   Max.   :14.2857   Max.   :162.500  
##     dptMiss          polMiss          mmrMiss          hepMiss      
##  Min.   : 0.000   Min.   : 0.000   Min.   : 0.000   Min.   : 0.000  
##  1st Qu.: 1.699   1st Qu.: 1.504   1st Qu.: 1.429   1st Qu.: 0.722  
##  Median : 4.918   Median : 4.587   Median : 5.042   Median : 3.073  
##  Mean   : 8.638   Mean   : 8.229   Mean   : 8.793   Mean   : 5.909  
##  3rd Qu.:10.960   3rd Qu.:10.507   3rd Qu.:11.111   3rd Qu.: 7.767  
##  Max.   :81.250   Max.   :81.250   Max.   :81.250   Max.   :79.070  
##     varMiss       isPublicPrivate       val   
##  Min.   : 0.000   Min.   :0.0000   Min.   :1  
##  1st Qu.: 0.000   1st Qu.:1.0000   1st Qu.:1  
##  Median : 2.485   Median :1.0000   Median :1  
##  Mean   : 5.371   Mean   :0.7822   Mean   :1  
##  3rd Qu.: 6.667   3rd Qu.:1.0000   3rd Qu.:1  
##  Max.   :79.070   Max.   :1.0000   Max.   :1

Predictive Analyses:

13. What’s the big picture, based on all of the foregoing analyses?